Connect to Spark
sc <- spark_connect(master = "local")
* Using Spark: 3.1.1
- Read in the reddit dataset correctly and provide evidence of your code.
# read the dataset
reddit <- read.csv("RS_2017-09_filtered70.csv")
reddit <- as_tibble(reddit)
reddit
# data selection
reddit.selection <- select(reddit, brand_safe, num_crossposts, over_18, is_self, is_reddit_media_domain, is_video, stickied, spoiler)
# convert the brand_safe variable to 1 and 0
reddit.selection <- reddit.selection %>%
mutate(brand_safe=as.logical(as.logical(brand_safe))) %>%
mutate(over_18=as.logical(as.logical(over_18))) %>%
mutate(is_self=as.logical(as.logical(is_self))) %>%
mutate(is_reddit_media_domain=as.logical(as.logical(is_reddit_media_domain))) %>%
mutate(is_video=as.logical(as.logical(is_video))) %>%
mutate(stickied=as.logical(as.logical(stickied))) %>%
mutate(spoiler=as.logical(as.logical(spoiler)))
# data summary
summary(reddit.selection)
brand_safe num_crossposts over_18 is_self is_reddit_media_domain is_video
Mode :logical Min. :0.0000000 Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:65141 1st Qu.:0.0000000 FALSE:127226 FALSE:78627 FALSE:122255 FALSE:139379
TRUE :74682 Median :0.0000000 TRUE :12597 TRUE :61196 TRUE :17568 TRUE :444
Mean :0.0007509
3rd Qu.:0.0000000
Max. :3.0000000
stickied spoiler
Mode :logical Mode :logical
FALSE:139724 FALSE:138983
TRUE :99 TRUE :840
- Build a classifier based on the six factors and provide the classifier information. Provide the correct analysis about the factors.
# copy the dataset to spark
reddit_tbl <- copy_to(sc, reddit.selection, "reddit", overwrite=TRUE)
# split the data in training and testing data sets
partitions <- reddit_tbl %>%
na.omit() %>%
sdf_random_split(training=0.7, test=0.3, seed=42)
* No rows dropped by 'na.omit' call
train_tbl <- partitions$train
test_tbl <- partitions$test
# select the factors to include in the model
factors <- c("over_18", "is_self", "is_reddit_media_domain", "is_video", "stickied", "spoiler")
formula <- as.formula(paste("brand_safe ~ ", paste(factors, collapse="+")))
test_model <- function(title, model, formula, data) {
# build the model
ml_model <- data %>%
model(formula)
# perform the predictions
predictions <- ml_predict(ml_model, data)
# print the AUC
print(paste(
title,
ml_binary_classification_evaluator(predictions, metric_name = "areaUnderROC")))
}
test_model("ml_logistic_regression", ml_logistic_regression, formula, train_tbl)
[1] "ml_logistic_regression 0.700856558338718"
test_model("ml_random_forest", ml_random_forest, formula, train_tbl)
[1] "ml_random_forest 0.700699574974018"
test_model("ml_gradient_boosted_trees", ml_gradient_boosted_trees, formula, train_tbl)
[1] "ml_gradient_boosted_trees 0.701015105927723"
# create the model
ml_model <- train_tbl %>%
ml_gradient_boosted_trees(formula)
# perform the predictions
predictions <- ml_predict(ml_model, train_tbl)
head(predictions)
predictions$p1 <- unlist(pred_lr$probability)[ c(FALSE,TRUE) ]
Error in unlist(pred_lr$probability) : object 'pred_lr' not found
#ml_log <- train_tbl %>%
#ml_random_forest(formula)
#ml_gradient_boosted_trees(formula)
#ml_logistic_regression(formula)
#pred.train <- ml_predict(ml_log, train_tbl)
#pred.test <- ml_predict(ml_log, test_tbl)
#ml_binary_classification_evaluator(pred.train, metric_name = "areaUnderROC")
#ml_binary_classification_evaluator(pred.test, metric_name = "areaUnderROC")
# perform the predictions
#pred_lr <- pred.train %>% collect
#pred_lr$p1 <- unlist(pred_lr$probability)[ c(FALSE,TRUE) ]
# calculate the roc values
#ROC_lr <- get_roc(L = pred_lr$late_arrival, f = pred_lr$p1)
# plot the roc curve
#ggplot(ROC_lr, aes(x = FPR, y = TPR)) + geom_line(aes(col = "Model Prediction"))
LS0tCnRpdGxlOiAiUHJvamVjdCBUYXNrIDIgLSBzcGFya2x5ciIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyBJbXBvcnQgTGlicmFyaWVzCgpgYGB7cn0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkocFJPQykKbGlicmFyeShzcGFya2x5cikKYGBgCiMgSGVscGVyIEZ1bmN0aW9ucwoKYGBge3J9CiMnIFJPQyBjdXJ2ZSBjb2RlCiMnCiMnIEJhc2VkIG9uIGFsZ28gMSBwYWdlIDg2NiwgRmF3Y2V0dDIwMDUKIycKIycgQHBhcmFtIEwgb2JzZXJ2YXRpb25zIAojJyBAcGFyYW0gZiwgcHJlZGljdGVkIHByb2IuCiMnCiMnIEByZXR1cm4gcG9pbnRzIGluIFJPQyBzcGFjZSBhbmQgc2NvcmUKZ2V0X3JvYyA8LSBmdW5jdGlvbihMLCBmKSB7CiAgIyBDYWxjdWxhdGUgUCBhbmQgTgogIFAgPC0gc3VtKEw9PTEpCiAgTiA8LSBzdW0oTD09MCkKICAjIE9yZGVyIHRoZSBvYnNlcnZhdGlvbnMgYnkgcHJlZGljdGlvbgogIGRmICA8LSB0aWJibGUoTCwgZikKICBkZiA8LSBkZiAlPiUgYXJyYW5nZShkZXNjKGYpKQogICMgU2V0IFRQIGFuZCBGUCB0byB6ZXJvCiAgVFAgPC0gMAogIEZQIDwtIDAKICAjIFNldCB1cCBtYXRyaXggZm9yIHJlc3VsdHMKICBSIDwtIE5VTEwKICAjIFNldCBwcmV2aW91cyBmCiAgZl9wcmV2IDwtIC1JbmYKICAjIHNldCBjb3VudGVyCiAgaSA8LSAxCiAgd2hpbGUoaSA8PSBsZW5ndGgoZGYkTCkpewogICAgaWYoIGRmJGZbaV0gIT0gZl9wcmV2KXsKICAgICAgUiA8LSByYmluZChSLCBjKEZQL04sIFRQL1AsIGRmJGZbaV0pKQogICAgICBmX3ByZXYgPC0gZGYkZltpXQogICAgfQogICAgaWYoZGYkTFtpXSA9PSAxKXsKICAgICAgVFAgPC0gVFAgKyAxCiAgICB9IGVsc2UgewogICAgICBGUCA8LSBGUCArIDEKICAgIH0KICAgIGkgPC0gaSArIDEKICB9CiAgUiA8LSByYmluZChSLCBjKEZQL04sIFRQL1AsIGZfcHJldikpCiAgUiA8LSBkYXRhLmZyYW1lKFIpCiAgY29sbmFtZXMoUikgPC0gYygiRlBSIiwiVFBSIiwgIlNjb3JlIikKICByZXR1cm4oUikKfQpgYGAKCgojIENvbm5lY3QgdG8gU3BhcmsKCmBgYHtyfQpzYyA8LSBzcGFya19jb25uZWN0KG1hc3RlciA9ICJsb2NhbCIpCmBgYAoKMS4JUmVhZCBpbiB0aGUgcmVkZGl0IGRhdGFzZXQgY29ycmVjdGx5IGFuZCBwcm92aWRlIGV2aWRlbmNlIG9mIHlvdXIgY29kZS4gCgpgYGB7cn0KIyByZWFkIHRoZSBkYXRhc2V0CnJlZGRpdCA8LSByZWFkLmNzdigiUlNfMjAxNy0wOV9maWx0ZXJlZDcwLmNzdiIpCnJlZGRpdCA8LSBhc190aWJibGUocmVkZGl0KQpyZWRkaXQKYGBgCgpgYGB7cn0KIyBkYXRhIHNlbGVjdGlvbgpyZWRkaXQuc2VsZWN0aW9uIDwtIHNlbGVjdChyZWRkaXQsIGJyYW5kX3NhZmUsIG51bV9jcm9zc3Bvc3RzLCBvdmVyXzE4LCBpc19zZWxmLCBpc19yZWRkaXRfbWVkaWFfZG9tYWluLCBpc192aWRlbywgc3RpY2tpZWQsIHNwb2lsZXIpCgojIGNvbnZlcnQgdGhlIGJyYW5kX3NhZmUgdmFyaWFibGUgdG8gMSBhbmQgMApyZWRkaXQuc2VsZWN0aW9uIDwtIHJlZGRpdC5zZWxlY3Rpb24gJT4lIAogIG11dGF0ZShicmFuZF9zYWZlPWFzLmxvZ2ljYWwoYXMubG9naWNhbChicmFuZF9zYWZlKSkpICU+JQogIG11dGF0ZShvdmVyXzE4PWFzLmxvZ2ljYWwoYXMubG9naWNhbChvdmVyXzE4KSkpICU+JQogIG11dGF0ZShpc19zZWxmPWFzLmxvZ2ljYWwoYXMubG9naWNhbChpc19zZWxmKSkpICU+JQogIG11dGF0ZShpc19yZWRkaXRfbWVkaWFfZG9tYWluPWFzLmxvZ2ljYWwoYXMubG9naWNhbChpc19yZWRkaXRfbWVkaWFfZG9tYWluKSkpICU+JQogIG11dGF0ZShpc192aWRlbz1hcy5sb2dpY2FsKGFzLmxvZ2ljYWwoaXNfdmlkZW8pKSkgJT4lCiAgbXV0YXRlKHN0aWNraWVkPWFzLmxvZ2ljYWwoYXMubG9naWNhbChzdGlja2llZCkpKSAlPiUKICBtdXRhdGUoc3BvaWxlcj1hcy5sb2dpY2FsKGFzLmxvZ2ljYWwoc3BvaWxlcikpKQoKIyBkYXRhIHN1bW1hcnkKc3VtbWFyeShyZWRkaXQuc2VsZWN0aW9uKQpgYGAKMi4JQnVpbGQgYSBjbGFzc2lmaWVyIGJhc2VkIG9uIHRoZSBzaXggZmFjdG9ycyBhbmQgcHJvdmlkZSB0aGUgY2xhc3NpZmllciBpbmZvcm1hdGlvbi4gUHJvdmlkZSB0aGUgY29ycmVjdCBhbmFseXNpcyBhYm91dCB0aGUgZmFjdG9ycy4gCgoKYGBge3J9CiMgY29weSB0aGUgZGF0YXNldCB0byBzcGFyawpyZWRkaXRfdGJsIDwtIGNvcHlfdG8oc2MsIHJlZGRpdC5zZWxlY3Rpb24sICJyZWRkaXQiLCBvdmVyd3JpdGU9VFJVRSkKCiMgc3BsaXQgdGhlIGRhdGEgaW4gdHJhaW5pbmcgYW5kIHRlc3RpbmcgZGF0YSBzZXRzCnBhcnRpdGlvbnMgPC0gcmVkZGl0X3RibCAlPiUKICBuYS5vbWl0KCkgJT4lCiAgc2RmX3JhbmRvbV9zcGxpdCh0cmFpbmluZz0wLjcsIHRlc3Q9MC4zLCBzZWVkPTQyKQoKdHJhaW5fdGJsIDwtIHBhcnRpdGlvbnMkdHJhaW4KdGVzdF90YmwgPC0gcGFydGl0aW9ucyR0ZXN0CmBgYApgYGB7cn0KIyBzZWxlY3QgdGhlIGZhY3RvcnMgdG8gaW5jbHVkZSBpbiB0aGUgbW9kZWwKZmFjdG9ycyA8LSBjKCJvdmVyXzE4IiwgImlzX3NlbGYiLCAiaXNfcmVkZGl0X21lZGlhX2RvbWFpbiIsICJpc192aWRlbyIsICJzdGlja2llZCIsICJzcG9pbGVyIikKZm9ybXVsYSA8LSBhcy5mb3JtdWxhKHBhc3RlKCJicmFuZF9zYWZlIH4gIiwgcGFzdGUoZmFjdG9ycywgY29sbGFwc2U9IisiKSkpCmBgYAoKYGBge3J9CnRlc3RfbW9kZWwgPC0gZnVuY3Rpb24odGl0bGUsIG1vZGVsLCBmb3JtdWxhLCBkYXRhKSB7CiAgIyBidWlsZCB0aGUgbW9kZWwKICBtbF9tb2RlbCA8LSBkYXRhICU+JQogICAgbW9kZWwoZm9ybXVsYSkKCiAgIyBwZXJmb3JtIHRoZSBwcmVkaWN0aW9ucwogIHByZWRpY3Rpb25zIDwtIG1sX3ByZWRpY3QobWxfbW9kZWwsIGRhdGEpCgogICMgcHJpbnQgdGhlIEFVQwogIHByaW50KHBhc3RlKAogICAgdGl0bGUsCiAgICBtbF9iaW5hcnlfY2xhc3NpZmljYXRpb25fZXZhbHVhdG9yKHByZWRpY3Rpb25zLCBtZXRyaWNfbmFtZSA9ICJhcmVhVW5kZXJST0MiKSkpCn0KCnRlc3RfbW9kZWwoIm1sX2xvZ2lzdGljX3JlZ3Jlc3Npb24iLCBtbF9sb2dpc3RpY19yZWdyZXNzaW9uLCBmb3JtdWxhLCB0cmFpbl90YmwpCnRlc3RfbW9kZWwoIm1sX3JhbmRvbV9mb3Jlc3QiLCBtbF9yYW5kb21fZm9yZXN0LCBmb3JtdWxhLCB0cmFpbl90YmwpCnRlc3RfbW9kZWwoIm1sX2dyYWRpZW50X2Jvb3N0ZWRfdHJlZXMiLCBtbF9ncmFkaWVudF9ib29zdGVkX3RyZWVzLCBmb3JtdWxhLCB0cmFpbl90YmwpCmBgYApgYGB7cn0KIyBjcmVhdGUgdGhlIG1vZGVsCm1sX21vZGVsIDwtIHRyYWluX3RibCAlPiUKICBtbF9ncmFkaWVudF9ib29zdGVkX3RyZWVzKGZvcm11bGEpCgojIHBlcmZvcm0gdGhlIHByZWRpY3Rpb25zCnByZWRpY3Rpb25zIDwtIG1sX3ByZWRpY3QobWxfbW9kZWwsIHRyYWluX3RibCkKCmhlYWQocHJlZGljdGlvbnMpCmBgYAoKYGBge3J9CiAgcHJlZGljdGlvbnMkcDEgPC0gdW5saXN0KHByZWRpY3Rpb25zJHByb2JhYmlsaXR5KVsgYyhGQUxTRSxUUlVFKSBdCiAgCiAgIyBjYWxjdWxhdGUgdGhlIHJvYyB2YWx1ZXMKICAjcmV0dXJuIChnZXRfcm9jKEwgPSBwcmVkX2xyJGxhdGVfYXJyaXZhbCwgZiA9IHByZWRfbHIkcDEpKQpgYGAKCgpgYGB7cn0KI21sX2xvZyA8LSB0cmFpbl90YmwgJT4lCiAgI21sX3JhbmRvbV9mb3Jlc3QoZm9ybXVsYSkKICAjbWxfZ3JhZGllbnRfYm9vc3RlZF90cmVlcyhmb3JtdWxhKQogICNtbF9sb2dpc3RpY19yZWdyZXNzaW9uKGZvcm11bGEpCgojcHJlZC50cmFpbiA8LSBtbF9wcmVkaWN0KG1sX2xvZywgdHJhaW5fdGJsKQojcHJlZC50ZXN0IDwtIG1sX3ByZWRpY3QobWxfbG9nLCB0ZXN0X3RibCkKCiNtbF9iaW5hcnlfY2xhc3NpZmljYXRpb25fZXZhbHVhdG9yKHByZWQudHJhaW4sIG1ldHJpY19uYW1lID0gImFyZWFVbmRlclJPQyIpCiNtbF9iaW5hcnlfY2xhc3NpZmljYXRpb25fZXZhbHVhdG9yKHByZWQudGVzdCwgbWV0cmljX25hbWUgPSAiYXJlYVVuZGVyUk9DIikKYGBgCmBgYHtyfQojIHBlcmZvcm0gdGhlIHByZWRpY3Rpb25zCiNwcmVkX2xyIDwtIHByZWQudHJhaW4gJT4lIGNvbGxlY3QKI3ByZWRfbHIkcDEgPC0gdW5saXN0KHByZWRfbHIkcHJvYmFiaWxpdHkpWyBjKEZBTFNFLFRSVUUpIF0KCiMgY2FsY3VsYXRlIHRoZSByb2MgdmFsdWVzCiNST0NfbHIgPC0gZ2V0X3JvYyhMID0gcHJlZF9sciRsYXRlX2Fycml2YWwsIGYgPSBwcmVkX2xyJHAxKQogIAojIHBsb3QgdGhlIHJvYyBjdXJ2ZQojZ2dwbG90KFJPQ19sciwgYWVzKHggPSBGUFIsIHkgPSBUUFIpKSArIGdlb21fbGluZShhZXMoY29sID0gIk1vZGVsIFByZWRpY3Rpb24iKSkKYGBgCgo=